This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(Metrics)
library(readr)
library(ggplot2)#for visualisation
library(corrplot)#for visualisation of correlation
## corrplot 0.92 loaded
library(mlbench)
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(plotly)#converting ggplot to plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(reshape2)
library(lattice)
library(caret)
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
library(caTools)#for splitting data into testing and training data
library(dplyr) #manipulating dataframe
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(mlbench)
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
library(corrplot)
library(catboost)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Load the Boston housing dataset (506 rows x 14 columns) shipped with
# mlbench, and work on a copy so the package dataset stays untouched.
data("BostonHousing")
housing <- BostonHousing
dim(housing)
## [1] 506 14
# Inspect column types: 'chas' arrives as a 2-level factor ("0"/"1");
# every other column is numeric. 'medv' (median home value) is the target.
str(housing)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Check for any NA's in the dataframe.
# missmap() from Amelia draws a missingness map: black = observed,
# yellow = missing. For this dataset the map is solid black (no NAs).
missmap(housing,col=c('yellow','black'),y.at=1,y.labels='',legend=TRUE)
#Checking for NA and missing values and removing them.
# Count missing values across the whole data frame. sum() over the logical
# matrix returned by is.na() counts TRUE entries directly -- clearer than
# length(which(is.na(housing)==T)), and avoids the reassignable T shortcut
# (TRUE/FALSE should always be spelled out). Result is identical: 0.
numberOfNA <- sum(is.na(housing))
numberOfNA
## [1] 0
# Remove NA values.
# (A no-op here -- the count above is 0 -- but kept so the pipeline is
# robust if the input data ever changes.)
housing <- housing %>%
drop_na()
str(housing) #gives the structure of data
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# All 506 rows survive, as expected.
dim(housing)
## [1] 506 14
#Let's convert 'chas' to numeric.
# BUG FIX: as.numeric() applied directly to a factor returns the internal
# level codes (1/2), not the original labels (0/1) -- the printed str()
# output below and the chas summary (Min 1, Max 2) show exactly that
# coding. Round-tripping through as.character() recovers the intended
# 0/1 dummy values. (Printed output downstream reflects the old 1/2
# coding; only the intercept-equivalent shift is affected in linear
# models, but the fix keeps the variable's meaning honest.)
housing$chas <- as.numeric(as.character(housing$chas))
str(housing$chas)
## num [1:506] 1 1 1 1 1 1 1 1 1 1 ...
head(housing)#returns the first six rows of data
## crim zn indus chas nox rm age dis rad tax ptratio b lstat
## 1 0.00632 18 2.31 1 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 1 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 1 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 1 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 1 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 1 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
summary(housing) #gives the basic statistics of your dataset like mean, median, 1st quartile, 3rd quartile etc.
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :1.000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:1.000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :1.000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :1.069
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:1.000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :2.000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio b
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# In the summary above we can see that the variables 'crim' and 'b' take a wide range of values.
# Variables 'crim', 'zn', 'rm' and 'b' have a large difference between their median and mean, which indicates a lot of outliers in those variables.
# Box plots of the four suspect variables, side by side on one row.
par(mfrow = c(1, 4))
boxplot(housing$crim, main='crim',col='Sky Blue')
boxplot(housing$zn, main='zn',col='Sky Blue')
boxplot(housing$rm, main='rm',col='Sky Blue')
boxplot(housing$b, main='b',col='Sky Blue')
# As suggested earlier, variables 'crim', 'zn', 'rm' and 'b' do have a lot of outliers.
# Finding correlation
# Correlation is a statistical measure that suggests the level of linear dependence between two variables that occur in pair. Its value lies between -1 to +1.
# If above 0 it means positive correlation i.e. X is directly proportional to Y.
# If below 0 it means negative correlation i.e. X is inversely proportional to Y.
# Value 0 suggests a weak relation.
# Usually we would use the function 'cor' to find correlation between two variables, but since we have 14 variables here, it is easier to examine the correlation between different variables using the corrplot function in library 'corrplot'.
# Correlation plots are a great way of exploring data and seeing the level of interaction between the variables.
corrplot(cor(housing))
# Looking at the plots, we can see that (except for the diagonal):
# Attributes like 'tax and rad', 'nox and tax', 'age and indus' have positive correlation. Larger darker blue dots suggest a strong positive relationship.
# Attributes like 'dis and nox', 'dis and indus', 'age and dis' have negative correlation. Larger darker red dots suggest a strong negative relationship.
# Highly correlated variables.
# caret::findCorrelation returns the column indices it recommends removing
# so that no remaining pair has |correlation| above the cutoff (0.70 here).
correlated <- cor(housing)
highCorr <- findCorrelation(correlated, cutoff=0.70)
highCorr
## [1] 3 10 5 13 8
# Map the indices back to column names: indus, tax, nox, lstat, dis.
names(housing[highCorr])
## [1] "indus" "tax" "nox" "lstat" "dis"
# Let's split the loaded dataset into train and test sets. We will use 75% of the data to train our models and the remaining 25% will be used to test them.
# NOTE(review): sample.split() expects a *vector* (normally the outcome,
# e.g. housing$medv), not a whole data.frame. Passed a data.frame, it
# builds a logical split over the 14 columns which subset() then recycles
# down the 506 rows -- which is why train ends up with 362 rows instead of
# the expected ~380 (506 * 0.75). The intended call is
# sample.split(housing$medv, SplitRatio = 0.75). Left unchanged here so
# the recorded results below stay reproducible -- TODO fix on re-run.
set.seed(120)
split <- sample.split(housing,SplitRatio =0.75)
train <- subset(housing,split==TRUE)
test <- subset(housing,split==FALSE)
dim(housing)
## [1] 506 14
dim(train)
## [1] 362 14
dim(test)
## [1] 144 14
summary(train)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.0 Min. : 0.46 Min. :1.000
## 1st Qu.: 0.08276 1st Qu.: 0.0 1st Qu.: 5.13 1st Qu.:1.000
## Median : 0.25636 Median : 0.0 Median : 9.69 Median :1.000
## Mean : 3.52093 Mean : 11.2 Mean :11.05 Mean :1.061
## 3rd Qu.: 3.51983 3rd Qu.: 0.0 3rd Qu.:18.10 3rd Qu.:1.000
## Max. :88.97620 Max. :100.0 Max. :27.74 Max. :2.000
## nox rm age dis
## Min. :0.3890 Min. :3.561 Min. : 2.90 Min. : 1.137
## 1st Qu.:0.4490 1st Qu.:5.895 1st Qu.: 42.45 1st Qu.: 2.076
## Median :0.5380 Median :6.213 Median : 77.75 Median : 3.191
## Mean :0.5535 Mean :6.305 Mean : 68.03 Mean : 3.803
## 3rd Qu.:0.6240 3rd Qu.:6.640 3rd Qu.: 94.10 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio b
## Min. : 1.000 Min. :187.0 Min. :12.6 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.4 1st Qu.:375.24
## Median : 5.000 Median :329.0 Median :19.1 Median :391.70
## Mean : 9.378 Mean :405.8 Mean :18.5 Mean :357.90
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.2 3rd Qu.:396.90
## Max. :24.000 Max. :711.0 Max. :22.0 Max. :396.90
## lstat medv
## Min. : 1.730 Min. : 5.00
## 1st Qu.: 6.728 1st Qu.:16.80
## Median :10.685 Median :21.40
## Mean :12.315 Mean :22.67
## 3rd Qu.:16.402 3rd Qu.:25.27
## Max. :37.970 Max. :50.00
# Simple linear regression: predict medv from lstat alone.
# lstat is a natural first predictor given its strong (negative)
# correlation with medv in the corrplot above.
model <- lm(medv ~ lstat , data = train) # fit a simple linear regression model
summary(model)
##
## Call:
## lm(formula = medv ~ lstat, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.071 -4.154 -1.475 2.202 24.629
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.61596 0.67468 51.31 <2e-16 ***
## lstat -0.97012 0.04781 -20.29 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.268 on 360 degrees of freedom
## Multiple R-squared: 0.5335, Adjusted R-squared: 0.5322
## F-statistic: 411.7 on 1 and 360 DF, p-value: < 2.2e-16
# Score on the held-out test set: caret::postResample reports RMSE,
# R-squared and MAE of the predictions against the actuals.
pSimpleLineer <- predict(model,test)
postResample(pSimpleLineer,test$medv)
## RMSE Rsquared MAE
## 6.1010062 0.5741069 4.1948060
# Predicted vs actual medv, with a loess smoother; rendered interactively
# via plotly.
plLinearSimple <-test %>%
ggplot(aes(medv,pSimpleLineer)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of medv') +
ylab('Predicted value of medv')+
theme_bw()
ggplotly(plLinearSimple)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Multiple Linear Regression
# Let's build our model considering crim, rm, tax and lstat as the major
# influencers on the target variable.
model2 <- lm(medv ~ crim + rm + tax + lstat , data = train)
summary(model2)
##
## Call:
## lm(formula = medv ~ crim + rm + tax + lstat, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.977 -3.481 -1.176 1.871 30.418
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.110293 3.571261 -0.031 0.9754
## crim -0.081763 0.038589 -2.119 0.0348 *
## rm 5.070090 0.493432 10.275 <2e-16 ***
## tax -0.004923 0.002282 -2.158 0.0316 *
## lstat -0.560441 0.059535 -9.414 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.465 on 357 degrees of freedom
## Multiple R-squared: 0.6483, Adjusted R-squared: 0.6444
## F-statistic: 164.5 on 4 and 357 DF, p-value: < 2.2e-16
model2
##
## Call:
## lm(formula = medv ~ crim + rm + tax + lstat, data = train)
##
## Coefficients:
## (Intercept) crim rm tax lstat
## -0.110293 -0.081763 5.070090 -0.004923 -0.560441
# Test-set performance improves over the single-predictor model
# (RMSE 5.47 vs 6.10; R-squared 0.66 vs 0.57).
pMultipleLm <- predict(model2,test)
postResample(pMultipleLm,test$medv)
## RMSE Rsquared MAE
## 5.4726430 0.6579667 3.6944289
# Predicted vs actual plot for the multiple regression model.
plotMultipleLineer <-test %>%
ggplot(aes(medv,pMultipleLm)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of Price') +
ylab('Predicted value of Price')+
theme_bw()
ggplotly(plotMultipleLineer)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Remove Highly Correlated Variables and build models.
# Create new dataset w/o highly correlated variables: dropping columns
# 3, 10, 5, 13, 8 (indus, tax, nox, lstat, dis) leaves medv plus the
# remaining eight predictors.
corr_data <- train[,-highCorr]
model3 <- lm(medv ~ . , data = corr_data)
# predict() works on the full test set because lm only looks up the
# predictor columns named in the fitted formula.
pSimpleLineer3 <- predict(model3,test)
postResample(pSimpleLineer3,test$medv)
## RMSE Rsquared MAE
## 5.3167942 0.6726429 3.3492280
#Cross validation
# Fit a linear regression on all 13 predictors with 10-fold
# cross-validation via caret::train. 'control' is also reused below by
# the ridge and lasso models.
# (Removed: an unused design matrix `x <- data.matrix(train[,1:13])` and
# response `y <- train$medv` that were built here but never referenced
# anywhere in the script.)
control <- trainControl(method = "cv",
number = 10)
lineerCV <- train(medv~.,
data = train,
method = "lm",
trControl = control )
# Evaluate the CV-fitted model on the held-out test set.
pLineerCV <- predict(lineerCV,test)
postResample(pLineerCV,test$medv)
## RMSE Rsquared MAE
## 4.7853860 0.7395806 3.1963155
# Linear regression with CV: predicted vs actual plot.
plLinearCV <-test %>%
ggplot(aes(medv,pLineerCV)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of medv') +
ylab('Predicted value of medv')+
theme_bw()
ggplotly(plLinearCV)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Ridge regression (glmnet with alpha = 0), tuning lambda over a 50-point
# grid with the same 10-fold CV control defined above.
ridge <- train(medv~.,
data = train,
method = "glmnet",
tuneGrid = expand.grid(alpha = 0,
lambda = seq(0.0001,1,length=50)),
trControl = control )
# Test-set performance of the tuned ridge model.
pRidge <- predict(ridge,test)
postResample(pRidge,test$medv)
## RMSE Rsquared MAE
## 4.7743466 0.7361985 3.0830773
# Ridge: predicted vs actual plot.
plRidge <-test %>%
ggplot(aes(medv,pRidge)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of medv') +
ylab('Predicted value of medv')+
theme_bw()
ggplotly(plRidge)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Lasso regression (glmnet with alpha = 1), same lambda grid and CV
# control as the ridge model above.
lasso <- train(medv~.,
data = train,
method = "glmnet",
tuneGrid = expand.grid(alpha = 1,
lambda = seq(0.0001,1,length=50)),
trControl = control )
# Test RMSE
pLasso <- predict(lasso,test)
postResample(pLasso,test$medv)
## RMSE Rsquared MAE
## 4.7777320 0.7393834 3.1644723
# Lasso: predicted vs actual plot.
plLasso <-test %>%
ggplot(aes(medv,pLasso)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of medv') +
ylab('Predicted value of medv')+
theme_bw()
ggplotly(plLasso)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Separate x and y of the train and test datasets, which will be very
# useful when we use them with the catboost package.
y_train <- unlist(train[c('medv')])
X_train <- train %>% select(-medv)
y_valid <- unlist(test[c('medv')])
X_valid <- test %>% select(-medv)
# Convert the train and test datasets to catboost-specific format using
# the load_pool function, supplying x and y for both train and test.
train_pool <- catboost.load_pool(data = X_train, label = y_train)
test_pool <- catboost.load_pool(data = X_valid, label = y_valid)
# Create the input params for the CatBoost regression.
params <- list(iterations=500,
learning_rate=0.01,
depth=10,
loss_function='RMSE',
eval_metric='RMSE',
random_seed = 55,
od_type='Iter',
metric_period = 50,
od_wait=20,
use_best_model=TRUE)
# iterations - the maximum number of trees that can be built when solving
# machine learning problems.
# Build a model using catboost.train, passing the train pool and params.
# Note: no test_pool is passed, so CatBoost disables use_best_model (see
# the warning in the captured output below).
modelCatboost <- catboost.train(learn_pool = train_pool,params = params)
## You should provide test set for use best model. use_best_model parameter has been switched to false value.
## 0: learn: 9.0969322 total: 164ms remaining: 1m 21s
## 50: learn: 7.0396136 total: 957ms remaining: 8.42s
## 100: learn: 5.6154938 total: 1.66s remaining: 6.58s
## 150: learn: 4.6142807 total: 2.39s remaining: 5.52s
## 200: learn: 3.8905746 total: 3.14s remaining: 4.67s
## 250: learn: 3.3397315 total: 3.87s remaining: 3.84s
## 300: learn: 2.9277197 total: 4.59s remaining: 3.04s
## 350: learn: 2.6162423 total: 5.35s remaining: 2.27s
## 400: learn: 2.3565605 total: 6.1s remaining: 1.51s
## 450: learn: 2.1541226 total: 6.85s remaining: 744ms
## 499: learn: 1.9844896 total: 7.59s remaining: 0us
# Predict on the held-out test pool.
y_pred=catboost.predict(modelCatboost,test_pool)
# Calculate error metrics: CatBoost beats every linear model so far
# (RMSE 3.65 vs 4.77-6.10).
catboostMetrics <- postResample(y_pred,test$medv)
catboostMetrics
## RMSE Rsquared MAE
## 3.6521302 0.8772542 2.3072133
# CatBoost: predicted vs actual plot.
plCatboost <-test %>%
ggplot(aes(medv,y_pred)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of medv') +
ylab('Predicted value of medv')+
theme_bw()
ggplotly(plCatboost)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#Catboost without correlated variables
# BUG FIX (target leakage): corr_data / corr_dataTEST were built by
# dropping only the highCorr columns (3, 10, 5, 13, 8), so the target
# 'medv' (column 14) remained in the feature matrix while also being
# supplied as the label. CatBoost was therefore handed the answer as a
# feature, which yields unrealistically good test metrics. Drop medv from
# both feature matrices before building the pools.
corr_dataTEST <- test[,-highCorr]
#corr_data
train_pool2 <- catboost.load_pool(data = corr_data %>% select(-medv), label = y_train)
test_pool2 <- catboost.load_pool(data = corr_dataTEST %>% select(-medv), label = y_valid)
# Build a model on the reduced feature set.
# NOTE(review): the printed training log and metrics below were produced
# with corr_data = train[,-highCorr], which still contains 'medv' (column
# 14 is not in highCorr) -- i.e. the target was present among the features
# fed to CatBoost. The suspiciously low RMSE (2.18) and near-perfect
# R-squared (0.96) should therefore not be trusted as out-of-sample
# performance.
modelCatboost2 <- catboost.train(learn_pool = train_pool2,params = params)
## You should provide test set for use best model. use_best_model parameter has been switched to false value.
## 0: learn: 9.0952492 total: 13.5ms remaining: 6.75s
## 50: learn: 6.6649525 total: 568ms remaining: 5s
## 100: learn: 4.9337798 total: 1.1s remaining: 4.36s
## 150: learn: 3.7255671 total: 1.7s remaining: 3.92s
## 200: learn: 2.8594591 total: 2.21s remaining: 3.29s
## 250: learn: 2.2309073 total: 2.77s remaining: 2.75s
## 300: learn: 1.7771642 total: 3.31s remaining: 2.19s
## 350: learn: 1.4343749 total: 3.87s remaining: 1.64s
## 400: learn: 1.1869029 total: 4.44s remaining: 1.09s
## 450: learn: 0.9949640 total: 4.97s remaining: 540ms
## 499: learn: 0.8452080 total: 5.47s remaining: 0us
# Predict on the reduced-feature test pool and compute error metrics.
y_pred2=catboost.predict(modelCatboost2,test_pool2)
catboostMetrics2 <- postResample(y_pred2,test$medv)
catboostMetrics2
## RMSE Rsquared MAE
## 2.1770161 0.9638678 0.9684344
# Predicted vs actual plot for the second CatBoost model.
plCatboost2 <-test %>%
ggplot(aes(medv,y_pred2)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of medv') +
ylab('Predicted value of medv')+
theme_bw()
ggplotly(plCatboost2)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'